

setwd(base_dir)

modeltype <- "simplified"

source("./auxfiles/loaddata.R")

setwd(base_dir)

theme_set(theme_bw())
theme_update(text= element_text(size=22,color='black'),axis.text.y = element_text(color='black',size=14),
             axis.text.x = element_text(color='black',angle = 0, hjust = 0.5,size=14),
             plot.title = element_text(size=22,color='black',hjust=0.5), panel.grid.major=element_blank(),
             legend.background=element_blank(),
             panel.grid.minor=element_blank())

maindir<-getwd()

setwd("./intermediary")
typevals<-as.data.table(read.dta13("sp_temp_types.dta"))
setwd(maindir)


#plot scatter of mean wage on x, by type on y
#with normal variance, no variance
#setwd('/home/mdkellogg/StructuralModel/figures/kmeans')

#setwd('/fp/homes01/u01/ec-maxwellk/DynamicDI/harmonized_code/modeloutput/kmeans')
setwd("./modeloutput/kmeans")



cleansim<-function(simdata,select,selectC){
  simdata<-as.data.table(simdata)
  simdata[,Health:=Health+1]
  simdata[,spHealth:=spHealth+1]
  
  print(simdata[C<=0])
  simdata[,row:=1:nrow(simdata)]
  simdata[,Ceq:=C/oecd[[Type+1]][time+1,single+1],by=row]
  simdata[,age:=floor(time/params$agelength+23)]
  simdata[,ageold:=age>=45*params$agelength]
  simdata[,yearcons:=sum(Ceq),by=.(age,i,Type)]
  simdata[,yearDI:=DI==2 & age<=62*params$agelength]
  
  simdata[,select:=select]
  simdata[,selectC:=selectC]
  simdata[select==1,enterDI:=yearDI==1&c(NA,yearDI[-.N])==0&time<62*params$agelength,by=i]
  simdata[select==1,LastDI:=c(NA,yearDI[-.N])==1&time<62*params$agelength,by=i]
  simdata[,ApplyLast:=c(NA,Apply[-.N]),by=i]
  simdata[,RejectedLast:=c(0,Rejected[-.N]),by=i]
  simdata[,HealthLast:=c(NA,Health[-.N]),by=i]
  simdata[select==1&yearDI==1&enterDI!=1,enterDI:=NA]
  simdata[,Type:=Type+1]
}

evalkmean<-function(#wagevar,
                    select,
                    selectC,
#                    wagemult_shock=0,
#                    spwagemult_shock=0,
#                    het_healthrisk=0,het_workdis=1,het_workcost=1,
#                    het_wagereg=1,
                    plotname){
  

  simdata<-Valsim_solve(as.vector(param_input),params,healthprob, marriageprob,foodstamps,numkids,oecd,aperm(Asset),aperm(Wage), aperm(spWage),
                        shocks.wagereal,shocks.spwagereal,shocks.health,shocks.allow,shocks.reassess,shocks.jobloss,shocks.spjobloss, shocks.marriage,
                        marriageprob_init,numsims,lumpsum=0,print=0,workDI=0,onlyFT=1)  
  
simdata<-cleansim(simdata,select=select,selectC=selectC)
simdata[Health==1,healthyemp:= Work == 1]  
simdata[,Workyears:=sum((Work+Workpart)*select),by=i]
simdata[,everhealthywork := max(healthyemp,na.rm=TRUE),by=i]
simdata[,insample:=everhealthywork == 1 & Workyears >= 3]
simdata[,lw:=log(Wage) + shocks.wagenoise]
simdata[insample == 1& Health ==1,healthyempres:=felm(healthyemp ~ 1 | age,data=simdata[insample ==1& Health ==1,])$residuals]
simdata[insample == 1&Health==1,healthyempres:=healthyempres/(var(healthyempres)^0.5),by=time]
simdata[insample == 1&Work==1,lwres:=felm(lw ~ 1 | age,data=simdata[insample==1&Work==1,])$residuals]
simdata[insample == 1&Work==1,lwres:=lwres/(var(lwres)^0.5),by=time]
simdata[insample == 1,meanlwres:=mean(lwres,na.rm=TRUE),by=i]
simdata[insample == 1,meanhealthyempres:=mean(healthyempres,na.rm=TRUE),by=i]
pdata<-simdata[insample == 1,.(meanlwres=mean(lwres,na.rm=TRUE),meanhealthyempres=mean(healthyempres,na.rm=TRUE)),by=.(i,Type)]
pdata[!is.na(meanlwres) & !is.na(meanhealthyempres),Typehat:=kmeans(cbind(meanlwres,meanhealthyempres),centers=cbind(quantile(pdata$meanlwres,probs=c(0.25,0.5,0.75),na.rm=TRUE),quantile(pdata$meanhealthyempres,probs=c(0.25,0.5,0.75),na.rm=TRUE)),iter.max=100)$cluster]
pdata[! is.na(meanlwres) & !is.na(meanhealthyempres),Typehat_fine:=kmeans(cbind(meanlwres,meanhealthyempres),centers=cbind(quantile(pdata$meanlwres,probs=c(0.125,0.25,0.375,0.5,0.625,0.75),na.rm=TRUE),quantile(pdata$meanhealthyempres,probs=c(0.125,0.25,0.375,0.5,0.625,0.75),na.rm=TRUE)),iter.max=100)$cluster]

#Probabilities by ventiles:
lwquantiles <- pdata[!is.na(meanlwres) & !is.na(meanhealthyempres),quantile(meanlwres,probs=seq(from=0,to=0.95,length.out=20),na.rm=TRUE)]
lwquantiles[1]<- -Inf
empquantiles <- pdata[!is.na(meanlwres) & !is.na(meanhealthyempres),quantile(meanhealthyempres,probs=seq(from=0,to=0.95,length.out=20),na.rm=TRUE)]
empquantiles[1] <- -Inf

pdata[!is.na(meanlwres) & !is.na(meanhealthyempres),q_lw := max(which(meanlwres > lwquantiles)),by=.(i,Type)]
pdata[!is.na(meanlwres) & !is.na(meanhealthyempres),q_emp := max(which(meanhealthyempres > empquantiles)),by=.(i,Type)]
#Prob that a person in each bin belongs to each type:
type_probs <- pdata[,.(p_type1 = mean(Type==1),
                       p_type2 = mean(Type==2),
                       p_type3 = mean(Type==3)),by=.(q_lw,q_emp)]
#DEFINE k-means by posterior probability here. Load in typehats and meanlwres's.
#nonparametrically estimate type-specific probabilities this way:
#STEP 1: get the type-specific joint densities with wage  P(W|T)*P(T)
#assign a person the type that has the highest value for them.
typevals[!is.na(lw_res) & !is.na(healthyemp_res),q_lw := max(which(lw_res > lwquantiles)),by=.(newid)]
typevals[!is.na(lw_res) & !is.na(healthyemp_res),q_emp := max(which(healthyemp_res > empquantiles)),by=.(newid)]
newtypes<-merge(typevals,type_probs,by=c("q_lw","q_emp"),all.x=TRUE)
newtypes[,p_max := pmax(p_type1,p_type2,p_type3)]
newtypes[p_max==p_type1,type_prodpost:=1]
newtypes[p_max==p_type2,type_prodpost:=2]
newtypes[p_max==p_type3,type_prodpost:=3]
 
# for(typ in 1:numtypes){
#   typevals[!is.na(lw_res) & !is.na(healthyemp_res),eval(paste0("prob_",typ)):=kde2d(x = pdata[! is.na(meanlwres) & !is.na(meanhealthyempres) & Type==typ,meanlwres],
#                                                                                       y = pdata[! is.na(meanlwres) & !is.na(meanhealthyempres) & Type==typ,meanhealthyempres],
#                                                                                       lims = c(lw_res,lw_res,healthyemp_res,healthyemp_res)
#                                                                                       ,n=1)$z[1,1]*(numsims[typ]/sum(numsims)),by=newid]
# #  typevals[!is.na(lw_res) & !is.na(healthyemp_res),eval(paste0("prob_",typ)):=density(simdata[! is.na(meanlwres) & !is.na(meanhealthyempres) & Type==typ,lwres],from=lw_res,to=lw_res,n=1)$y*(numsims[typ]/sum(numsims)),by=newid]
#   typevals[is.na(lw_res) & !is.na(healthyemp_res),eval(paste0("prob_",typ)):=pdata[is.na(meanlwres) | is.na(meanhealthyempres),mean(Type==typ)]]
# }
# typevals[,maxprob_post:=0]
# for(typ in 1:numtypes){
#   typevals[,maxprob_post:=max(get(paste0("prob_",typ)),maxprob_post),by=newid]
#   typevals[get(paste0("prob_",typ))==maxprob_post,type_prodpost:=typ]
# }
# typevals[,maxprob_post:=NULL]

save.dta13(newtypes,paste0("./sp_temp_types_post",plotname,".dta"))

saveRDS(newtypes[,table(type_stationary,type_prodpost, useNA = "always")],paste0("kmeansim_postperformance",plotname,".RDS"))

typedensity<-newtypes[!is.na(lw_res),][order(lw_res),]
typedensity[,row:=1]
typedensity[,count:=cumsum(row)]
typedensity[,count:=max(count),by=lw_res]
typedensity[,y:=count/nrow(typedensity[!is.na(lw_res),])]
typedensity[,x:=lw_res]

pdf(paste0("kmeansim_postperformance",plotname,".pdf"))
print(ggplot()
      +geom_ribbon(data=typedensity,
                   aes(ymin=0,ymax=y,x=x,fill=as.factor(type_prodpost)), alpha=0.25)
      +geom_step(data=typedensity[order(x),],
                 aes(y=y,x=x),linetype="dashed")
      + geom_ribbon(data=typedensity[type_stationary==1,],
                    aes(x=x,ymax=y,ymin=0,
                        color="1"), size = 1, alpha =0)
      + geom_ribbon(data=typedensity[type_stationary==3,],
                    aes(x=x,ymax=y,ymin=0,
                        color="3"), size = 1, alpha =0)
      + geom_ribbon(data=typedensity[type_stationary==2,],
                    aes(x=x,ymax=y,ymin=0,
                        color="2"), size = 1, alpha =0)
      #      +scale_y_continuous(breaks=c(0,0.25,0.5,0.75,1),limits=c(-0.1,1))
      +guides(color=FALSE)
      +theme(legend.position="bottom",  legend.box="vertical")
      +labs(y="Density",x="Mean Detrended Log Wage",fill="Type",color="Type")
)



groupdata<-data.table(table(newtypes[,max(type_stationary),by=newid]$V1,
                            newtypes[,max(type_prodpost),by=newid]$V1, useNA="ifany"))
names(groupdata)<-c("Type_stationary","Type_prodpost","N")
groupdata[,totN:=sum(N),by=Type_stationary]
groupdata[,share:=N/totN]
groupdata[Type_stationary==Type_prodpost,group:="No Change in Type"]
groupdata[Type_stationary!=Type_prodpost,group:="Moved to Mid-Type"]

groupdata<-groupdata[!is.na(Type_stationary)&!is.na(Type_prodpost),]

print(ggplot(data=groupdata[,])
      +geom_bar(aes(y=share,x=Type_stationary,fill=factor(Type_prodpost)),position="stack",stat="identity")
      +coord_cartesian(ylim=c(0,1))
      +labs(y="Proportion of Households",x="Type",fill="")
#      +scale_fill_manual(values= c("No Change in Type"="gray0","Moved to Mid-Type"="gray50"), drop=FALSE)
      +theme(legend.position="bottom",  legend.box="vertical")
)

print(ggplot(data=groupdata[,])
      +geom_bar(aes(y=share,x=Type_stationary,fill=factor(group,levels=c("Moved to Mid-Type","No Change in Type"))),position="stack",stat="identity")
      +coord_cartesian(ylim=c(0,1))
      +labs(y="Proportion of Households",x="Type",fill="")
      +scale_fill_manual(values= c("No Change in Type"="gray0","Moved to Mid-Type"="gray50"), drop=FALSE)
      +theme(legend.position="bottom",  legend.box="vertical")
)
dev.off()

return(simdata)
}


#estimated variance:
evalkmean(select=select,
          selectC=selectC,
          #wagevar=wagecov,
          #wagemult_shock=0,
          #spwagemult_shock=0,
          #het_healthrisk = 0, het_workdis = 1, het_workcost=1,
          plotname="simplified")


